outliers <- read_tsv("../input_data/druggable_outliers_from_treehouse_and_other_cohorts_2023_11_09-13_46_32_2023.tsv") %>%
  mutate(high_level_cohort = ifelse(str_detect(comparison_cohort, "Treehouse"),
                                    "Treehouse",
                                    comparison_cohort))
## Rows: 287 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sample_ID, comparison_cohort, gene, donor_ID
## lgl (1): pathway_support
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

COMPARE DISTRIBUTIONS FOR FOR OUTLIERS ACROSS COHORTS

outlier_genes_detected <- unique(outliers$gene)

expr <- read_tsv("../input_data/druggable_TumorCompendium_v11_PolyA_hugo_log2tpm_58581genes_2020-04-09.tsv.gz") %>%
  rename(Sample_ID = TH_id) %>%
  filter(Gene %in% outlier_genes_detected)
## Rows: 1414917 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, TH_id
## dbl (1): log2TPM1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stanford_samples  <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TH03_TH34_rollup.sample_list.txt",
                              col_names = "Sample_ID") %>%
  mutate(cohort = "Stanford")
## Rows: 110 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
TCGA_samples  <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TCGA_rollup.sample_list.txt",
                              col_names = "Sample_ID") %>%
  mutate(cohort = "TCGA")
## Rows: 9806 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
PEDAYA_samples  <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/PEDAYA_rollup.sample_list.txt",
                              col_names = "Sample_ID") %>%
  mutate(cohort = "PEDAYA")
## Rows: 2814 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pan_cancer_samples <- expr %>%
  select(Sample_ID) %>%
  distinct() %>%
  mutate(cohort = "Treehouse_pancancer")


samples_in_cohorts <- bind_rows(
  stanford_samples,
  TCGA_samples,
  PEDAYA_samples,
  pan_cancer_samples)


tabyl(samples_in_cohorts,
      cohort)
##               cohort     n    percent
##               PEDAYA  2814 0.11045257
##             Stanford   110 0.00431762
##                 TCGA  9806 0.38489618
##  Treehouse_pancancer 12747 0.50033363

expression in samples not in the compendium

rsem_path <- "../input_data/non_compendium_expression"

gene_name_conversion <- read_tsv(file.path(rsem_path,
                                           "EnsGeneID_Hugo_Observed_Conversions.txt"))
## Rows: 60498 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): HugoID, EnsGeneID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
relevant_gene_name_conversion <- gene_name_conversion %>%
  filter(HugoID %in% outlier_genes_detected)

rsem_kitchen_sink_data <- tibble(file_name = list.files(
  path = rsem_path,
  pattern = "_rsem_genes.results")) %>%
  rowwise() %>%
  mutate(rsem_raw = list(read_tsv(file.path(rsem_path, file_name),
                                     show_col_types = FALSE
                                     ))) %>%
  unnest(rsem_raw) %>%
  filter(gene_id %in% relevant_gene_name_conversion$EnsGeneID) %>%
  mutate(Sample_ID = str_extract(file_name, "TH[R]?[0-9]{2}_[0-9]{4}_S[0-9]{2}")) %>%
  left_join(relevant_gene_name_conversion, 
            by=c("gene_id"="EnsGeneID")) %>%
    group_by(Sample_ID, HugoID) %>%
    summarize(sum_TPM = sum(TPM),
              n=n()) %>%
    mutate(log2TPM1 = log2(sum_TPM +1))
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
table(rsem_kitchen_sink_data$n)
## 
##   1   2 
## 275   5
patient_expression_from_rsem_files <- rsem_kitchen_sink_data %>%
  select(gene = HugoID,
         log2TPM1,
         Sample_ID)

patient_expression_from_compendia <- outliers %>%
  select(Sample_ID, gene) %>%
  distinct() %>%
  left_join(expr, 
            by=c("Sample_ID", "gene"="Gene"))

patient_expression <- bind_rows(
  patient_expression_from_rsem_files,
  patient_expression_from_compendia)
  
length(outlier_genes_detected)
## [1] 56
outliers$Sample_ID[ ! outliers$Sample_ID %in% expr$Sample_ID] %>% unique()
## [1] "TH34_1400_S01" "TH34_2292_S01" "TH34_2666_S01" "TH34_1445_S02"
## [5] "TH34_1456_S02"

outliers

patient_expression
## # A tibble: 410 × 3
## # Groups:   Sample_ID [34]
##    gene  log2TPM1 Sample_ID    
##    <chr>    <dbl> <chr>        
##  1 AKT1     6.41  TH34_1400_S01
##  2 AKT2     7.55  TH34_1400_S01
##  3 ALK      0.791 TH34_1400_S01
##  4 BCL6     6.68  TH34_1400_S01
##  5 BTK      2.09  TH34_1400_S01
##  6 CCND1    5.10  TH34_1400_S01
##  7 CCND2    3.35  TH34_1400_S01
##  8 CCND3    4.52  TH34_1400_S01
##  9 CCNE1    1.17  TH34_1400_S01
## 10 CDK4     5.63  TH34_1400_S01
## # ℹ 400 more rows
this_gene <- "FGFR4"

one_gene_expr_per_cohort <- bind_rows(
  expr %>%
    filter(Gene == this_gene,
           Sample_ID %in% stanford_samples$Sample_ID) %>%
    mutate(cohort = "Stanford"),
  expr %>%
    filter(Gene == this_gene,
           Sample_ID %in% TCGA_samples$Sample_ID) %>%
    mutate(cohort = "TCGA"),
  expr %>%
    filter(Gene == this_gene,
           Sample_ID %in% PEDAYA_samples$Sample_ID) %>%
    mutate(cohort = "PEDAYA"),
  expr %>%
    filter(Gene == this_gene,
           Sample_ID %in% pan_cancer_samples$Sample_ID) %>%
    mutate(cohort = "pan_cancer"))
# How many colors to i need

outliers %>%
  group_by(gene) %>%
  summarize(n_samples = length(unique(Sample_ID))) %>%
  arrange(desc(n_samples))
## # A tibble: 56 × 2
##    gene  n_samples
##    <chr>     <int>
##  1 IGF2         18
##  2 HMOX1         8
##  3 NTRK2         7
##  4 FGFR4         5
##  5 ETV1          4
##  6 NTRK3         4
##  7 BTK           3
##  8 CDK9          3
##  9 FGFR1         3
## 10 FLT4          3
## # ℹ 46 more rows
library(khroma)

lapply(outlier_genes_detected, function(this_gene){
  # this_gene <- "PIK3R5"
  relevant_patient_expression <- patient_expression %>%
    filter(gene == this_gene) %>%
    filter(Sample_ID %in% (outliers %>%
                             filter(gene == this_gene) %>% 
                             pull(Sample_ID)))
                           
  
  one_gene_expr_per_cohort <- left_join(samples_in_cohorts,
                                        expr %>%
                                          filter(Gene == this_gene))
  
  p1 <- ggplot(one_gene_expr_per_cohort) +
    geom_histogram(aes(x=log2TPM1)) +
    geom_vline(data = relevant_patient_expression,
               aes(xintercept = log2TPM1,
                   color = Sample_ID)) +
    scale_color_brewer(palette = "Set1") +
    facet_col(~cohort, scales = "free_y") +
  ggtitle(this_gene)
  
  p2 <- ggplot(one_gene_expr_per_cohort) +
    geom_boxplot(aes(x=log2TPM1)) +
    geom_vline(data = relevant_patient_expression,
               aes(xintercept = log2TPM1,
                   color = Sample_ID)) +
    scale_color_brewer(palette = "Set1") +
    facet_col(~cohort) 
  
  
  outlier_table <- outliers %>%
    filter(gene == this_gene) %>%
    select(Sample_ID, gene, comparison_cohort) %>%
    mutate(found = TRUE) %>%
    pivot_wider(names_from = comparison_cohort,
                values_from = found)
  
  t1 <- tableGrob(outlier_table, theme=ttheme_minimal(), rows=NULL)  # transform into a tableGrob
  
  cowplot::plot_grid(p1, p2, t1,
                     ncol = 1)
})
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(n, pal): Removed 4 rows containing missing values (`geom_vline()`).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]